import csv
import re

import requests

# Scrape the Douban Top 250 movie list page by page and save one CSV row per
# movie (title, director, year, rating, number of ratings) to top250.csv.

url = 'https://movie.douban.com/top250'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Compiled once, outside the paging loop, because the pattern never changes.
# re.S lets "." match newline characters, which is needed because each movie
# entry spans many lines of HTML.
obj = re.compile(r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?<p class="">'
                 r'.*?导演: (?P<dao>.*?)&nbsp;.*?<br>(?P<year>.*?)&nbsp;'
                 r'.*?<span class="rating_num" property="v:average">'
                 r'(?P<score>.*?)</span>.*?<span>(?P<nums>.*?)人评价</span>', re.S)

# The "with" block guarantees the output file is closed even if a request or
# a match fails part-way through. newline="" is what the csv module expects
# for files it writes to; the explicit "\n" terminator keeps the same line
# ending the script wrote before.
with open("top250.csv", mode="w", encoding='utf-8', newline="") as f:
    # csv.writer quotes fields that contain commas or quotes, so such values
    # can no longer corrupt the column layout.
    writer = csv.writer(f, lineterminator="\n")

    # Paging: the "start" query parameter is the offset of the first movie on
    # the page and advances by 25. Offsets run 0, 25, ..., 225; offset 250
    # would be past the end of a 250-entry list, so it is not requested.
    for start in range(0, 250, 25):
        params = {
            "start": start
        }

        # The timeout stops a stalled connection from hanging the script, and
        # the "with" block releases every response, not just the last one.
        with requests.get(url, headers=headers, params=params, timeout=10) as resp:
            # Fail loudly on an HTTP error instead of silently writing no rows.
            resp.raise_for_status()
            print(resp.request.url)
            pageSource = resp.text

        # Each match is one movie entry on the current page.
        for item in obj.finditer(pageSource):
            writer.writerow([
                item.group("name"),
                item.group("dao"),
                item.group("year").strip(),  # drop surrounding whitespace
                item.group("score"),
                item.group("nums"),
            ])

print("top250提取完毕")
